import pandas as pd
import numpy as np
import plotly.express as px
from datasets import load_dataset
from sklearn.preprocessing import StandardScaler
#Ensure that plots from plotly will work in the html file
import plotly.io as pio
# Use the notebook renderer so plotly figures work in the exported HTML file.
pio.renderers.default = "notebook"

# Lift pandas' display limits so every row and column is shown.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Load the training split into a DataFrame and discard rows with missing values.
dataset = load_dataset("james-burton/imdb_genre_prediction")
df = pd.DataFrame(dataset["train"]).dropna()
df.head()
| Rank | Title | Description | Director | Actors | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | Genre_is_Drama | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 645 | Viral | Following the outbreak of a virus that wipes o... | Henry Joost | Sofia Black-D'Elia, Analeigh Tipton,Travis Top... | 2016 | 85 | 5.5 | 3564 | 82.877094 | 72.000000 | 1 |
| 1 | 693 | Genius | A chronicle of Max Perkins's time as the book ... | Michael Grandage | Colin Firth, Jude Law, Nicole Kidman, Laura Li... | 2016 | 104 | 6.5 | 10708 | 1.360000 | 58.576613 | 1 |
| 2 | 909 | Slither | A small town is taken over by an alien plague,... | James Gunn | Nathan Fillion, Elizabeth Banks, Michael Rooke... | 2006 | 95 | 6.5 | 64351 | 7.770000 | 69.000000 | 0 |
| 3 | 204 | Iron Man | After being held captive in an Afghan cave, bi... | Jon Favreau | Robert Downey Jr., Gwyneth Paltrow, Terrence H... | 2008 | 126 | 7.9 | 737719 | 318.300000 | 79.000000 | 0 |
| 4 | 73 | A Monster Calls | A boy seeks the help of a tree monster to cope... | J.A. Bayona | Lewis MacDougall, Sigourney Weaver, Felicity J... | 2016 | 108 | 7.5 | 39134 | 3.730000 | 76.000000 | 1 |
print(df.columns)

# Flag rows whose Title repeats an earlier row (the first occurrence is not flagged).
repeated_title = df.duplicated(subset=["Title"])
print(f"Duplicated: {repeated_title.sum()}")

# Only display the flagged rows; nothing is dropped from df here.
df.loc[repeated_title]
Index(['Rank', 'Title', 'Description', 'Director', 'Actors', 'Year',
'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
'Metascore', 'Genre_is_Drama'],
dtype='object')
Duplicated: 1
| Rank | Title | Description | Director | Actors | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | Genre_is_Drama | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 610 | 633 | The Host | A monster emerges from Seoul's Han River and f... | Bong Joon Ho | Kang-ho Song, Hee-Bong Byun, Hae-il Park, Doon... | 2006 | 120 | 7.0 | 73491 | 2.2 | 85.0 | 1 |
# Show every row titled "The Host" to compare the entries flagged as duplicates.
df.loc[df["Title"] == "The Host"]
| Rank | Title | Description | Director | Actors | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | Genre_is_Drama | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 356 | 240 | The Host | When an unseen enemy threatens mankind by taki... | Andrew Niccol | Saoirse Ronan, Max Irons, Jake Abel, Diane Kruger | 2013 | 125 | 5.9 | 96852 | 26.62 | 35.0 | 0 |
| 610 | 633 | The Host | A monster emerges from Seoul's Han River and f... | Bong Joon Ho | Kang-ho Song, Hee-Bong Byun, Hae-il Park, Doon... | 2006 | 120 | 7.0 | 73491 | 2.20 | 85.0 | 1 |
Since they are from different directors and years, we do not consider them duplicated rows.
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# df.info()
df.describe()
| Rank | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | Genre_is_Drama | |
|---|---|---|---|---|---|---|---|---|
| count | 680.000000 | 680.000000 | 680.000000 | 680.000000 | 6.800000e+02 | 680.000000 | 680.000000 | 680.000000 |
| mean | 493.772059 | 2012.827941 | 113.379412 | 6.753676 | 1.676742e+05 | 83.833422 | 58.865702 | 0.520588 |
| std | 288.989300 | 3.219251 | 18.652157 | 0.925049 | 1.818064e+05 | 100.015538 | 16.461824 | 0.499944 |
| min | 1.000000 | 2006.000000 | 73.000000 | 2.700000 | 6.100000e+01 | 0.000000 | 18.000000 | 0.000000 |
| 25% | 245.750000 | 2010.000000 | 100.000000 | 6.200000 | 3.445425e+04 | 14.845000 | 48.000000 | 0.000000 |
| 50% | 483.000000 | 2014.000000 | 111.000000 | 6.800000 | 1.111915e+05 | 58.645000 | 58.576613 | 1.000000 |
| 75% | 741.250000 | 2016.000000 | 124.000000 | 7.400000 | 2.405820e+05 | 100.012500 | 71.000000 | 1.000000 |
| max | 999.000000 | 2016.000000 | 187.000000 | 8.800000 | 1.583625e+06 | 936.630000 | 100.000000 | 1.000000 |
# Draw one histogram per column of df to inspect each distribution.
# The loop body must be indented; without it the cell raises IndentationError.
for column in df.columns:
    fig = px.histogram(df, x=column)
    fig.show()
From the graphs above, we see that the ratings are concentrated between 6 and 8. This unbalanced distribution can lead our model to predict every rating in this range in order to minimize the loss instead of actually learning. The ideal scenario would be ratings that follow a uniform distribution, so that no bias is built into our dataset.
Getting insights
Since the formula for calculating the correlation coefficient standardizes the variables, changes in scale or units of measurement will not affect its value.
# Keep only the numeric columns; text columns cannot enter a Pearson correlation.
numeric_columns = df.select_dtypes(include=np.number).columns
numeric_df = df[numeric_columns].copy()

# No standardisation is applied beforehand: the correlation coefficient is
# unaffected by changes in scale or units of measurement.
df_corr = numeric_df.corr(method="pearson")

# Heatmap of the correlation matrix with the coefficient written in each cell.
fig = px.imshow(df_corr, text_auto=True)
fig.show()
We can get some insights, for example:
- There is no correlation between Revenue and Rating. Therefore, a movie that earns a lot of money is not necessarily a high-quality movie.
We can also make pairwise comparisons of specific columns in dedicated graphs.
# Pairwise scatter plots between the selected numeric columns.
pair_dimensions = ["Year", "Votes", "Revenue (Millions)", "Rating"]
fig = px.scatter_matrix(df, dimensions=pair_dimensions)
fig.show()
We can analyze whether the descriptions contain enough words to give us more detailed information about the movies.
# Display the column labels of df.
df.columns
Index(['Rank', 'Title', 'Description', 'Director', 'Actors', 'Year',
'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
'Metascore', 'Genre_is_Drama'],
dtype='object')
# Number of whitespace-separated words in each movie description (one value per movie).
description_lengths = df["Description"].str.split().str.len()

# Bar chart: how many movies have each description length.
words = description_lengths.value_counts()
fig = px.bar(words)
fig.show()

# The ECDF has to be built from the per-movie lengths. Feeding it the
# value_counts() result would plot the distribution of the frequencies
# (how often each length occurs) instead of the description lengths.
fig = px.ecdf(description_lengths)
fig.show()
As we can see in the graph above, there is not much information in the movie descriptions, which can make training our NLP models difficult. Also, 50% of our movies have fewer than 15 words in the description.
Let's check the extreme values.
# Movies whose description has at most 10 whitespace-separated words.
description_word_counts = df["Description"].str.split().str.len()
df.loc[description_word_counts <= 10]
| Rank | Title | Description | Director | Actors | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | Genre_is_Drama | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 256 | 634 | Megan Is Missing | Two teenage girls encounter an Internet child ... | Michael Goi | Amber Perkins, Rachel Quinn, Dean Waite, Jael ... | 2011 | 85 | 4.9 | 6683 | 82.877094 | 94.000000 | 1 |
| 533 | 525 | Trust | A teenage girl is targeted by an online sexual... | David Schwimmer | Clive Owen, Catherine Keener, Liana Liberato,J... | 2010 | 106 | 7.0 | 36043 | 0.060000 | 60.000000 | 1 |
| 551 | 524 | Whiskey Tango Foxtrot | A journalist recounts her wartime coverage in ... | Glenn Ficarra | Tina Fey, Margot Robbie, Martin Freeman, Alfre... | 2016 | 112 | 6.6 | 36156 | 82.877094 | 58.576613 | 1 |
# Movies whose description has at least 50 whitespace-separated words.
description_word_counts = df["Description"].str.split().str.len()
df.loc[description_word_counts >= 50]
| Rank | Title | Description | Director | Actors | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | Genre_is_Drama | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8 | 836 | The Loft | Five married guys conspire to secretly share a... | Erik Van Looy | Karl Urban, James Marsden, Wentworth Miller, E... | 2014 | 108 | 6.3 | 38804 | 5.980000 | 24.000000 | 0 |
| 59 | 898 | The Fall | In a hospital on the outskirts of 1920s Los An... | Tarsem Singh | Lee Pace, Catinca Untaru, Justine Waddell, Kim... | 2006 | 117 | 7.9 | 93036 | 2.280000 | 64.000000 | 1 |
| 106 | 735 | Mr. Church | "Mr. Church" tells the story of a unique frien... | Bruce Beresford | Eddie Murphy, Britt Robertson, Natascha McElho... | 2016 | 104 | 7.7 | 16163 | 0.690000 | 37.000000 | 1 |
| 178 | 232 | A Kind of Murder | In 1960s New York, Walter Stackhouse is a succ... | Andy Goddard | Patrick Wilson, Jessica Biel, Haley Bennett, V... | 2016 | 95 | 5.2 | 3305 | 0.000000 | 50.000000 | 1 |
| 196 | 323 | RocknRolla | In London, a real-estate scam puts millions of... | Guy Ritchie | Gerard Butler, Tom Wilkinson, Idris Elba, Than... | 2008 | 114 | 7.3 | 203096 | 5.690000 | 53.000000 | 0 |
| 254 | 155 | Twin Peaks: The Missing Pieces | Twin Peaks before Twin Peaks (1990) and at the... | David Lynch | Chris Isaak, Kiefer Sutherland, C.H. Evans, Sa... | 2014 | 91 | 8.1 | 1973 | 82.877094 | 58.576613 | 1 |
| 417 | 863 | Alexander and the Terrible, Horrible, No Good,... | Alexander's day begins with gum stuck in his h... | Miguel Arteta | Steve Carell, Jennifer Garner, Ed Oxenbould, D... | 2014 | 81 | 6.2 | 32310 | 66.950000 | 54.000000 | 0 |
| 498 | 324 | In Time | In a future where people stop aging at 25, but... | Andrew Niccol | Justin Timberlake, Amanda Seyfried, Cillian Mu... | 2011 | 109 | 6.7 | 319025 | 37.550000 | 53.000000 | 0 |
| 582 | 764 | Eagle Eye | Jerry and Rachel are two strangers thrown toge... | D.J. Caruso | Shia LaBeouf, Michelle Monaghan, Rosario Dawso... | 2008 | 118 | 6.6 | 156158 | 101.110000 | 43.000000 | 0 |
| 629 | 960 | Lucky Number Slevin | A case of mistaken identity lands Slevin into ... | Paul McGuigan | Josh Hartnett, Ben Kingsley, Morgan Freeman, L... | 2006 | 110 | 7.8 | 271940 | 22.490000 | 53.000000 | 1 |
Questions
Since it is not possible to predict the rating from the description using the dataset above, we now have another question: which directors are safer to invest in to make good films?
We will define a movie with a rating of 7 or above as a good movie; otherwise it is a bad movie.
In that scenario we need to calculate the average rating of the movies each director made and select only the directors whose average is 7 or above.
To be fair to directors who are starting their career, we will not consider those who have only one movie. Hence, they have the opportunity to make one more movie before being automatically selected by this filter in the future.
# Mean rating and number of movies for every director.
director_stats = df.groupby("Director")["Rating"].agg(["mean", "count"])

# Keep only directors with at least two movies in the dataset.
view = director_stats.loc[director_stats["count"] >= 2]
print(f"Number of directors with more than one movie: {len(view)}")
Number of directors with more than one movie: 117
# Directors whose average rating is 7 or higher.
view.loc[view["mean"] >= 7]
| mean | count | |
|---|---|---|
| Director | ||
| Andrew Stanton | 7.466667 | 3 |
| Ang Lee | 7.100000 | 2 |
| Anthony Russo | 7.850000 | 2 |
| Antoine Fuqua | 7.000000 | 4 |
| Ben Affleck | 7.266667 | 3 |
| Bong Joon Ho | 7.000000 | 2 |
| Brad Bird | 7.300000 | 3 |
| Christopher McQuarrie | 7.200000 | 2 |
| Christopher Nolan | 8.633333 | 3 |
| Clint Eastwood | 7.766667 | 3 |
| Damien Chazelle | 8.400000 | 2 |
| Darren Aronofsky | 7.650000 | 2 |
| David Ayer | 7.166667 | 3 |
| David Fincher | 7.900000 | 3 |
| David Lynch | 7.550000 | 2 |
| David O. Russell | 7.375000 | 4 |
| David Yates | 7.420000 | 5 |
| Dean DeBlois | 8.000000 | 2 |
| Denis Villeneuve | 7.733333 | 3 |
| Duncan Jones | 7.700000 | 2 |
| Edgar Wright | 7.700000 | 2 |
| Ethan Coen | 7.333333 | 3 |
| Gabriele Muccino | 7.850000 | 2 |
| Gareth Edwards | 7.150000 | 2 |
| Guillermo del Toro | 7.200000 | 4 |
| Guy Ritchie | 7.366667 | 3 |
| J.A. Bayona | 7.550000 | 2 |
| J.J. Abrams | 7.700000 | 4 |
| James Gunn | 7.133333 | 3 |
| James Wan | 7.233333 | 3 |
| Jason Reitman | 7.450000 | 2 |
| Jean-Marc Vallée | 7.366667 | 3 |
| John Lee Hancock | 7.466667 | 3 |
| Jon Favreau | 7.400000 | 3 |
| Kathryn Bigelow | 7.500000 | 2 |
| Lone Scherfig | 7.000000 | 2 |
| Martin Scorsese | 7.850000 | 4 |
| Matthew Vaughn | 7.750000 | 2 |
| Mel Gibson | 8.000000 | 2 |
| Neil Burger | 7.050000 | 2 |
| Neill Blomkamp | 7.300000 | 2 |
| Nicolas Winding Refn | 7.033333 | 3 |
| Paul Greengrass | 7.400000 | 2 |
| Pete Docter | 8.250000 | 2 |
| Peter Berg | 7.450000 | 2 |
| Peter Jackson | 7.733333 | 3 |
| Phil Lord | 7.450000 | 2 |
| Quentin Tarantino | 7.733333 | 3 |
| Rajkumar Hirani | 8.300000 | 2 |
| Ron Howard | 7.150000 | 2 |
| Sam Mendes | 7.050000 | 2 |
| Shane Black | 7.300000 | 2 |
| Steve McQueen | 7.633333 | 3 |
| Steven Spielberg | 7.133333 | 3 |
| Tate Taylor | 7.300000 | 2 |
| Tom Hooper | 7.533333 | 3 |
| Tom Tykwer | 7.500000 | 2 |
| Wes Anderson | 7.800000 | 2 |
| Woody Allen | 7.166667 | 3 |
| Xavier Dolan | 7.550000 | 2 |
| Yorgos Lanthimos | 7.200000 | 2 |
Now we obtain a list of the directors that we should avoid when thinking about releasing a new movie.
# Directors whose average rating is below 7.
view.loc[view["mean"] < 7]
| mean | count | |
|---|---|---|
| Director | ||
| Adam McKay | 6.733333 | 3 |
| Alexandre Aja | 5.900000 | 2 |
| Andrew Niccol | 6.300000 | 2 |
| Anne Fletcher | 6.600000 | 2 |
| Ben Wheatley | 6.350000 | 2 |
| Christian Ditter | 6.650000 | 2 |
| D.J. Caruso | 5.800000 | 3 |
| Dennis Dugan | 5.825000 | 4 |
| Eli Roth | 5.150000 | 2 |
| Elizabeth Banks | 5.400000 | 2 |
| Francis Lawrence | 6.833333 | 3 |
| Frank Coraci | 5.650000 | 2 |
| George Tillman Jr. | 6.900000 | 2 |
| Glenn Ficarra | 6.866667 | 3 |
| Gore Verbinski | 6.966667 | 3 |
| Henry Joost | 6.050000 | 2 |
| Jaume Collet-Serra | 6.950000 | 2 |
| Joe Wright | 6.800000 | 2 |
| John Crowley | 6.850000 | 2 |
| John R. Leonetti | 5.000000 | 2 |
| Jon M. Chu | 6.350000 | 2 |
| Joseph Kosinski | 6.900000 | 2 |
| Justin Lin | 6.566667 | 3 |
| Kenneth Branagh | 6.600000 | 2 |
| Kevin Smith | 5.433333 | 3 |
| Lars von Trier | 6.850000 | 4 |
| Len Wiseman | 6.750000 | 2 |
| Louis Leterrier | 6.525000 | 4 |
| M. Night Shyamalan | 5.075000 | 4 |
| Marcus Nispel | 5.400000 | 2 |
| Martin Campbell | 6.800000 | 2 |
| McG | 6.366667 | 3 |
| Michael Bay | 6.360000 | 5 |
| Michael Mann | 6.200000 | 2 |
| Michael Patrick King | 4.900000 | 2 |
| Mikael Håfström | 6.750000 | 2 |
| Mike Flanagan | 6.400000 | 3 |
| Nicholas Stoller | 6.600000 | 3 |
| Paul Feig | 5.950000 | 2 |
| Paul McGuigan | 6.900000 | 2 |
| Paul W.S. Anderson | 5.820000 | 5 |
| Peyton Reed | 6.550000 | 2 |
| Rawson Marshall Thurber | 6.650000 | 2 |
| Richard LaGravenese | 6.933333 | 3 |
| Ridley Scott | 6.620000 | 5 |
| Robert Schwentke | 6.000000 | 2 |
| Robert Zemeckis | 6.866667 | 3 |
| Roland Emmerich | 6.100000 | 2 |
| Sam Raimi | 6.250000 | 2 |
| Scott Cooper | 6.850000 | 2 |
| Scott Hicks | 6.050000 | 2 |
| Sylvester Stallone | 6.933333 | 3 |
| Tim Burton | 6.600000 | 2 |
| Wes Ball | 6.550000 | 2 |
| Will Gluck | 5.950000 | 2 |
| Zack Snyder | 6.800000 | 3 |